In [1]:
import yaml
import pandas as pd
from functools import reduce
import numpy as np

def get_config():
    with open("config.yaml", 'r') as stream:
        config = yaml.safe_load(stream)
    return config

config = get_config()
filepath = (config['datapath_as4'])
if not filepath.endswith('.csv'):
    raise Exception("incorrect file format")
df = pd.read_csv(filepath)

print("Percentage of missing data for each ariable:\n", 1-df.count()/float(df.shape[0]))

independent_vars = ['PhoneReach', 'PhoneTime', 'Tired', 'Breakfast']  # because they cause an effect
dependent_vars = ['Hours', 'Enough'] # these parameters are effected
print(f'Dependent vars: {dependent_vars}')
print(f'Independent vars: {independent_vars}')

df = df.dropna()
df['Hours'] = pd.to_numeric(df['Hours']).astype('int')

# validate Tired variable values
def limitTiredValues(row):  
    if row['Tired'] > 5:
        return 5
    elif row['Tired'] < 1:
        return 1
    return row['Tired']

df['Tiredness_value'] = df.apply(lambda row: limitTiredValues(row), axis=1)
df = df.drop('Tired', axis=1)

def convertYesNoToBool(row, column_name):
    if row[column_name] == 'Yes':
        return True
    elif row[column_name] == 'No':
        return False
    raise Exception("incorrect patient answer")

def covertToBoolColumn(old_column_name, new_column_name):
    global df
    df[new_column_name] = df.apply(lambda row: convertYesNoToBool(row, old_column_name), axis=1)
    df = df.drop(old_column_name, axis=1)
    
covertToBoolColumn('Enough', 'EnoughSleep')
covertToBoolColumn('PhoneReach', 'InPhoneReach')
covertToBoolColumn('PhoneTime', 'UsedPhoneBeforeSleep')
# covertToBoolColumn('Breakfast', 'HadBreakfast')

df.info()
Percentage of missing data for each ariable:
 Enough        0.000000
Hours         0.019231
PhoneReach    0.000000
PhoneTime     0.000000
Tired         0.000000
Breakfast     0.000000
dtype: float64
Dependent vars: ['Hours', 'Enough']
Independent vars: ['PhoneReach', 'PhoneTime', 'Tired', 'Breakfast']
<class 'pandas.core.frame.DataFrame'>
Int64Index: 102 entries, 0 to 103
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Hours                 102 non-null    int64 
 1   Breakfast             102 non-null    object
 2   Tiredness_value       102 non-null    int64 
 3   EnoughSleep           102 non-null    bool  
 4   InPhoneReach          102 non-null    bool  
 5   UsedPhoneBeforeSleep  102 non-null    bool  
dtypes: bool(3), int64(2), object(1)
memory usage: 3.5+ KB
In [2]:
def hadBreakfast(row):  
    if row['Breakfast'] == 'Yes':
        return "had breakfast"
    return "no breakfast"

df['Breakfast'] = df.apply(lambda row: hadBreakfast(row), axis=1)
df.head()
Out[2]:
Hours Breakfast Tiredness_value EnoughSleep InPhoneReach UsedPhoneBeforeSleep
0 8 had breakfast 3 True True True
1 6 no breakfast 3 False True True
2 6 had breakfast 2 True True True
3 7 no breakfast 4 False True True
4 7 had breakfast 2 False True True
In [3]:
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, CategoricalColorMapper
from bokeh.io import output_notebook
from bokeh.resources import INLINE
output_notebook(INLINE)

source = ColumnDataSource(
        data = {'Hours': df['Hours'], 'Tiredness_value': df['Tiredness_value'], 'Breakfast': df['Breakfast']}
    )

palette=['green', 'red']
mapper = CategoricalColorMapper( factors=['had breakfast', 'no breakfast'], palette=palette)
pl = figure(plot_width=800, plot_height=400, title='The relation between being tired and hours of sleep', x_axis_label='Hours of sleep', y_axis_label='Tiredness value')
pl.scatter('Hours', 'Tiredness_value', source=source, size=9, alpha = 0.3, color={'field': 'Breakfast', 'transform': mapper}, legend_field='Breakfast')

show(pl)
Loading BokehJS ...
In [4]:
mean = df[['Hours', 'Tiredness_value']].mean(numeric_only=True)
print(f'Mean of target parameters:\n{mean}\n')

std = df[['Hours', 'Tiredness_value']].std(numeric_only=True)
print(f'Standart deviation of target parameters:\n{std}\n')

df_no_breakfast = df[df['Breakfast'] == 'no breakfast']
df_had_breakfast = df[df['Breakfast'] == 'had breakfast']

mean = df_had_breakfast[['Hours', 'Tiredness_value']].mean(numeric_only=True)
print(f'Mean of target parameters (had breakfast):\n{mean}\n')

std = df_had_breakfast[['Hours', 'Tiredness_value']].std(numeric_only=True)
print(f'Standart deviation of target parameters (had breakfast):\n{std}\n')

mean = df_no_breakfast[['Hours', 'Tiredness_value']].mean(numeric_only=True)
print(f'Mean of target parameters (no breakfast):\n{mean}\n')

std = df_no_breakfast[['Hours', 'Tiredness_value']].std(numeric_only=True)
print(f'Standart deviation of target parameters (no breakfast):\n{std}')

# easy way to watch statistics
df.describe()

# we can see that patients who had breakfast sleep more hours on the average (6.9h) 
# compared to patients who didn't have breakfast (6.2h) 

# patients who had breakfast are less tired on the average (2.8) 
# compared to patients who didn't have breakfast (3.3)
Mean of target parameters:
Hours              6.656863
Tiredness_value    3.088235
dtype: float64

Standart deviation of target parameters:
Hours              1.417676
Tiredness_value    1.015747
dtype: float64

Mean of target parameters (had breakfast):
Hours              6.918033
Tiredness_value    2.885246
dtype: float64

Standart deviation of target parameters (had breakfast):
Hours              1.268793
Tiredness_value    0.950410
dtype: float64

Mean of target parameters (no breakfast):
Hours              6.268293
Tiredness_value    3.390244
dtype: float64

Standart deviation of target parameters (no breakfast):
Hours              1.549587
Tiredness_value    1.045898
dtype: float64
Out[4]:
Hours Tiredness_value
count 102.000000 102.000000
mean 6.656863 3.088235
std 1.417676 1.015747
min 2.000000 1.000000
25% 6.000000 2.000000
50% 7.000000 3.000000
75% 7.000000 4.000000
max 10.000000 5.000000
In [5]:
from bokeh.models import BasicTicker, ColorBar, LinearColorMapper, PrintfTickFormatter, FixedTicker

source1_had_breakfast = ColumnDataSource(
        data = {'Hours': df_had_breakfast['Hours'], 'Tiredness_value': df_had_breakfast['Tiredness_value'], 'Breakfast': df_had_breakfast['Breakfast']}
    )

source2_no_breakfast = ColumnDataSource(
        data = {'Hours': df_no_breakfast['Hours'], 'Tiredness_value': df_no_breakfast['Tiredness_value'], 'Breakfast': df_no_breakfast['Breakfast']}
    )

p = figure(title="The relation between being tired and hours of sleep for patients who had breakfast",
           x_axis_location="above", width=800, height=400,
           toolbar_location='below',
           x_axis_label='Hours of sleep', y_axis_label='Tiredness value')

p.rect(x="Hours", y="Tiredness_value", width=1, height=1,
       source=source1_had_breakfast,
       fill_color={'field': 'Breakfast', 'transform': mapper},
       line_color=None, alpha=0.2)

p2 = figure(title="The relation between being tired and hours of sleep for patients who didn't have breakfast",
           x_axis_location="above", width=800, height=400,
           toolbar_location='below',
           x_axis_label='Hours of sleep', y_axis_label='Tiredness value')

p2.rect(x="Hours", y="Tiredness_value", width=1, height=1,
       source=source2_no_breakfast,
       fill_color={'field': 'Breakfast', 'transform': mapper},
       line_color=None, alpha=0.2)

# the majority of patients who had breafast sleeps 6-8 hours compared to patients who didn't have breafast 
# with an average sleep of 5-7 hours
show(p)
show(p2)
In [6]:
from scipy.stats import norm

arr_hist, edges=np.histogram(df['Hours'], bins=8,range=[2,10], density='True')
hours = pd.DataFrame({'Hours': arr_hist, 'left': edges[:-1], 'right': edges[1:]})

p = figure(plot_height = 800, plot_width = 800,  title = 'Hours of sleep distribution',x_axis_label = 'Hours of sleep', y_axis_label = 'Number of patients')

p.quad(bottom=0, top=hours['Hours'], 
      left=hours['left'], right=hours['right'], 
      fill_color='green', alpha=0.5, line_color='black')

# average
mean_hours_value = df[['Hours']].mean(numeric_only=True)
p.line(np.full(2,mean_hours_value), [0, hours['Hours'].max()], legend_label="Mean value", line_width=4, color='red')

#rubust estimation
median_hours_value = df[['Hours']].median(numeric_only=True)
p.line(np.full(2,median_hours_value), [0, hours['Hours'].max()], legend_label="Median value", line_width=4, color='black')

# normal distribution line
x = np.linspace(2, 10, 200)
mu_MM = mean_hours_value
sigma2_MM = df[['Hours']].var(numeric_only=True)
sigma_MM = np.sqrt(sigma2_MM)
rv = np.array([norm.pdf(xi, loc = mu_MM, scale = sigma_MM) for xi in x])
p.line(x, rv.ravel(), legend_label="Normal distribution line", line_width=4, color='blue')

# data is approximately normally distributed
show(p)
In [7]:
from scipy import stats
shapiro_test = stats.shapiro(df[['Hours']])

# the p-value is 7.15833084541373e-05 which is less than the alpha(0.05). It means that we have 
# sufficient evidence to say that sample does not come from a normal distribution.
shapiro_test
Out[7]:
ShapiroResult(statistic=0.93398118019104, pvalue=7.15833084541373e-05)
In [8]:
import matplotlib.pyplot as plt
import seaborn as sns
fig = plt.figure(figsize=(10, 8))
ax = sns.boxplot(x='Tiredness_value', y='Hours', data=df, color='#ffccff')
ax = sns.swarmplot(x="Tiredness_value", y="Hours", data=df, color='#660066', size=4)
plt.show()

fig = plt.figure(figsize=(10, 8))
ax = sns.boxplot(x='Breakfast', y='Hours', data=df, color='#ffccff')
ax = sns.swarmplot(x="Breakfast", y="Hours", data=df, color='#660066', size=4)
plt.show()
In [9]:
from scipy.stats import levene

groups_by_value_of_tiredness = df.groupby("Tiredness_value")

group_hours_1 = groups_by_value_of_tiredness.get_group(1)['Hours']
group_hours_2 = groups_by_value_of_tiredness.get_group(2)['Hours']
group_hours_3 = groups_by_value_of_tiredness.get_group(3)['Hours']
group_hours_4 = groups_by_value_of_tiredness.get_group(4)['Hours']
group_hours_5 = groups_by_value_of_tiredness.get_group(5)['Hours']

# P (0.0018190082469875675)< 0.05 - varience differs (plot shows the same result)
print("Levene test(hours~tiredness):\n", levene(group_hours_1, group_hours_2, group_hours_3, group_hours_4, group_hours_5))

groups_by_breakfast = df.groupby("Breakfast")

group_hours_yes = groups_by_breakfast.get_group("had breakfast")['Hours']
group_hours_no = groups_by_breakfast.get_group("no breakfast")['Hours']

# P (0.3270310216888861)> 0.05 - varience is equal
print("Levene test(hours~breakfast):\n", levene(group_hours_yes, group_hours_no))
Levene test(hours~tiredness):
 LeveneResult(statistic=4.634534448505077, pvalue=0.0018190082469875675)
Levene test(hours~breakfast):
 LeveneResult(statistic=0.9701052432875272, pvalue=0.3270310216888861)
In [10]:
def groupTirednessValues(row):  
    if row['Tiredness_value'] < 3:
        return 'no'
    elif row['Tiredness_value'] > 3:
        return 'yes'
    return 'maybe'

df['Tiredness_value_answer'] = df.apply(lambda row: groupTirednessValues(row), axis=1)

groupedByTireness = df.groupby('Tiredness_value_answer')
yes_group = groupedByTireness.get_group('yes')
no_group = groupedByTireness.get_group('no')
maybe_group = groupedByTireness.get_group('maybe')
common_size = min(len(yes_group), len(no_group),len(maybe_group))

equal_samle_sized_df = yes_group.head(common_size)
equal_samle_sized_df = pd.concat([equal_samle_sized_df, maybe_group.head(common_size), no_group.head(common_size)], ignore_index=True)

equal_samle_sized_df
Out[10]:
Hours Breakfast Tiredness_value EnoughSleep InPhoneReach UsedPhoneBeforeSleep Tiredness_value_answer
0 7 no breakfast 4 False True True yes
1 7 no breakfast 4 False True True yes
2 10 no breakfast 4 False True True yes
3 6 had breakfast 4 True True True yes
4 2 no breakfast 5 False True True yes
... ... ... ... ... ... ... ...
85 9 had breakfast 2 True False False no
86 7 had breakfast 2 True False True no
87 7 no breakfast 2 True True True no
88 7 had breakfast 2 False True True no
89 7 had breakfast 2 True True True no

90 rows × 7 columns

In [11]:
# one-way ANOVA: hours - dependent vars, categories of tiredness - categorical vars
groups_frame = pd.DataFrame({"Hours":equal_samle_sized_df['Hours'],"Tiredness":equal_samle_sized_df['Tiredness_value_answer']})
groups_by_category_of_tiredness = groups_frame.groupby("Tiredness")

yes_group_hours = groups_by_category_of_tiredness.get_group('yes')['Hours']
no_group_hours = groups_by_category_of_tiredness.get_group('no')['Hours']
maybe_group_hours = groups_by_category_of_tiredness.get_group('maybe')['Hours']

# P(0.3679482317252606) > 0.05. It means that here is no significant effect on sleep duration
print("one-way ANOVA:\n", stats.f_oneway(yes_group_hours, no_group_hours, maybe_group_hours), "\n")
one-way ANOVA:
 F_onewayResult(statistic=1.0113915177006665, pvalue=0.3679482317252606) 

In [12]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

# two-way ANOVA: hours - dependent vars; categories of tiredness, breakfast - categorical vars
groups_frame = pd.DataFrame({"Hours":equal_samle_sized_df['Hours'],
                             "Tiredness":equal_samle_sized_df['Tiredness_value_answer'], 
                             'Breakfast': equal_samle_sized_df['Breakfast']})

model = ols('Hours ~ C(Tiredness) + C(Breakfast) + C(Tiredness):C(Breakfast)', data=groups_frame).fit()

# P for C(Tiredness):C(Breakfast) > 0.05 - there is no significant interaction effect between Tiredness and Breakfast
# Both factors have no statistically significant effect on sleep duration (their p-values > 0.05)
print("two-way ANOVA:\n", sm.stats.anova_lm(model, type=2), "\n")

# p-value > 0.05 - data is drawn from normal distribution
print("Shapiro test\n", stats.shapiro(model.resid))

# As the standardized residuals lie around the 45-degree line, it suggests that the residuals are approximately normally distributed
# (even though shapiro test doesn't prove it)
import matplotlib.pyplot as plt
res = model.resid 
fig = sm.qqplot(res, stats.t, fit=True, line="45")
plt.show()
two-way ANOVA:
                              df      sum_sq   mean_sq         F    PR(>F)
C(Tiredness)                2.0    4.422222  2.211111  1.013771  0.367244
C(Breakfast)                1.0    6.854180  6.854180  3.142568  0.079899
C(Tiredness):C(Breakfast)   2.0    0.135412  0.067706  0.031043  0.969445
Residual                   84.0  183.210407  2.181076       NaN       NaN 

Shapiro test
 ShapiroResult(statistic=0.9766788482666016, pvalue=0.10531754046678543)
In [13]:
import seaborn as sns
import matplotlib.image as mpimg 

scatter_plot = sns.lmplot(x="Tiredness_value", y="Hours",  data=equal_samle_sized_df, fit_reg=False, hue='Breakfast', markers=["o", "x"]).set(title='The relation between being tired and hours of sleep')
image = mpimg.imread('../sleeping_beauty.png') 
plt.imshow(image, zorder=0, extent=[0.0, 6.0, 2.0, 11.0], aspect='auto', alpha=0.3)
plt.show()
In [17]:
fig = plt.figure(figsize=(10, 8))
ax = sns.boxplot(x='Tiredness_value_answer', y='Hours', data=equal_samle_sized_df, color='#ffccff').set(title='Sleep duration categorized by tiredness factor')
ax = sns.swarmplot(x="Tiredness_value_answer", y="Hours", data=equal_samle_sized_df, color='#660066', size=4)
plt.show()
In [ ]: